library(dplyr)
library(tidyverse)
library(ggplot2)
library(kableExtra)

1 Data

# Load the cleaned Gapminder CSV and convert it to a tibble so the tidyverse
# verbs used throughout the analysis apply to it.
# as_tibble() replaces as.tibble(), which the tibble package deprecated in
# favour of the underscore-named function.
# NOTE(review): the absolute path is machine-specific; confirm it before
# running on another system.
data <- read.csv("/home/riya/BRN/gapminder_clean.csv") %>%
  as_tibble()

# Preview the first rows to check that the import worked.
head(data)
## # A tibble: 6 × 20
##       X Country.Name  Year Agriculture..value.added....…¹ CO2.emissions..metri…²
##   <int> <chr>        <int>                          <dbl>                  <dbl>
## 1     0 Afghanistan   1962                             NA                 0.0738
## 2     1 Afghanistan   1967                             NA                 0.124 
## 3     2 Afghanistan   1972                             NA                 0.131 
## 4     3 Afghanistan   1977                             NA                 0.183 
## 5     4 Afghanistan   1982                             NA                 0.166 
## 6     5 Afghanistan   1987                             NA                 0.276 
## # ℹ abbreviated names: ¹​Agriculture..value.added....of.GDP.,
## #   ²​CO2.emissions..metric.tons.per.capita.
## # ℹ 15 more variables:
## #   Domestic.credit.provided.by.financial.sector....of.GDP. <dbl>,
## #   Electric.power.consumption..kWh.per.capita. <dbl>,
## #   Energy.use..kg.of.oil.equivalent.per.capita. <dbl>,
## #   Exports.of.goods.and.services....of.GDP. <dbl>, …

2 Filtering the data to include only rows where Year is 1962

# Keep only the observations recorded for the year 1962.
filtered_data1 <- filter(data, Year == 1962)

2.1 Scatterplot

# Scatter plot for the 1962 subset: CO2 emissions per capita on the x axis,
# GDP per capita on the y axis.
ggplot(
  filtered_data1,
  aes(x = CO2.emissions..metric.tons.per.capita., y = gdpPercap)
) +
  geom_point()

# ggsave("scatterPlot.png",path="/home/riya/BRN/Plots")

2.2 Correlation

# Pearson correlation test between CO2 emissions per capita and GDP per capita
# in the 1962 subset. The result is stored so its components can be reused,
# then printed.
cor_res <- cor.test(
  x = filtered_data1$CO2.emissions..metric.tons.per.capita.,
  y = filtered_data1$gdpPercap
)
print(cor_res)
## 
##  Pearson's product-moment correlation
## 
## data:  filtered_data1$CO2.emissions..metric.tons.per.capita. and filtered_data1$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817
# Print the exact p-value, which the test summary only reports as a bound.
cor_res[["p.value"]]
## [1] 1.128679e-46

The correlation coefficient of approximately 0.9261 suggests a very strong positive linear relationship between “CO2 emissions (metric tons per capita)” and “GDP per capita”. The 95% confidence interval, from 0.8934697 to 0.9489792, lies well above zero and further supports this correlation.


3 Year with strongest correlation between ’CO2 emissions (metric tons per capita)’ and gdpPercap

# For each year, compute the correlation between CO2 emissions per capita and
# GDP per capita using only rows where both values are present, then display
# the year with the largest coefficient as a formatted table.
data %>%
  filter(
    !is.na(CO2.emissions..metric.tons.per.capita.),
    !is.na(gdpPercap)
  ) %>%
  group_by(Year) %>%
  summarise(
    cor = cor(CO2.emissions..metric.tons.per.capita., gdpPercap)
  ) %>%
  slice(which.max(cor)) %>%
  kbl() %>%
  kable_material()
Year cor
1967 0.9387918

The maximum correlation between CO2 emissions and gdpPercap was observed in 1967. Hence, we subset the data to include only the rows corresponding to the year 1967.

# Subset to the observations recorded for the year 1967.
filtered_data2 <- filter(data, Year == 1967)

# Show the first five rows of the subset as a striped, hover-enabled table
# inside a horizontally scrollable box.
filtered_data2 %>%
  head(5) %>%
  kbl() %>%
  kable_material(c("striped", "hover")) %>%
  scroll_box(width = "830px")
X Country.Name Year Agriculture..value.added….of.GDP. CO2.emissions..metric.tons.per.capita. Domestic.credit.provided.by.financial.sector….of.GDP. Electric.power.consumption..kWh.per.capita. Energy.use..kg.of.oil.equivalent.per.capita. Exports.of.goods.and.services….of.GDP. Fertility.rate..total..births.per.woman. GDP.growth..annual… Imports.of.goods.and.services….of.GDP. Industry..value.added….of.GDP. Inflation..GDP.deflator..annual… Life.expectancy.at.birth..total..years. Population.density..people.per.sq..km.of.land.area. Services..etc…value.added….of.GDP. pop continent gdpPercap
1 Afghanistan 1967 NA 0.1237824 9.917662 NA NA 6.772908 7.450 NA 14.20983 NA NA 35.38941 15.881812 NA 11537966 Asia 836.1971
11 Albania 1967 NA 1.3637463 NA NA NA NA 5.394 NA NA NA NA 66.28722 71.737153 NA 1984060 Europe 2760.1969
21 Algeria 1967 10.33067 0.6321184 27.977088 NA NA 23.434417 7.672 9.452963 21.63177 42.38589 1.312041 49.18751 5.606908 47.28345 12760499 Africa 3246.9918
31 American Samoa 1967 NA NA NA NA NA NA NA NA NA NA NA NA 125.580000 NA NA NA
41 Andorra 1967 NA NA NA NA NA NA NA NA NA NA NA NA 44.159574 NA NA NA

3.1 Scatter plot comparing ’CO2 emissions (metric tons per capita)’ and gdpPercap

library(plotly)

# Interactive scatter plot of GDP per capita against CO2 emissions per capita
# for the 1967 subset, coloured by continent, with point size scaled by
# population.
# `pop` is mapped to `size` by name: an unnamed first argument to aes() is
# taken as `x`, which would override the CO2 emissions axis with population.
p <- filtered_data2 %>%
  ggplot(aes(
    x = CO2.emissions..metric.tons.per.capita.,
    y = gdpPercap,
    color = continent
  )) +
  geom_point(aes(size = pop))

# Convert the static ggplot object into an interactive plotly widget.
ggplotly(p)
# ggsave("emVsgdp_scatterplot.png",plot=p,path ="/home/riya/BRN/Plots" )

4 Statistical tests to determine relationship between continent and ‘Energy use (kg of oil equivalent per capita)’

# Box plot of per-capita energy use grouped by continent, to visualise how the
# distributions differ between continents.
ggplot(
  data,
  aes(x = continent, y = Energy.use..kg.of.oil.equivalent.per.capita.)
) +
  geom_boxplot()

# ggsave("boxplot.png",path="/home/riya/BRN/Plots")

From the plot above, there seem to be some differences in energy use across the continents, particularly Asia, Europe and Oceania (the highest median is observed for Oceania). We will test the significance of these differences statistically using an ANOVA test.

# One-way ANOVA of per-capita energy use across continents; the fitted model
# is stored and its summary table printed.
# NOTE(review): the reported Df of 5 implies six continent groups; confirm
# whether a blank or missing continent label is being treated as its own group.
aov_model <- aov(
  formula = data$Energy.use..kg.of.oil.equivalent.per.capita. ~ data$continent
)
summary(aov_model)
##                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## data$continent    5 8.124e+08 162482656   21.88 <2e-16 ***
## Residuals      1404 1.043e+10   7426183                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness

Here, the observed p-value is very small (<2e-16) and provides strong evidence to reject the null hypothesis. This indicates statistically significant differences in energy use across the continents.


5 Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?

# Overlaid density curves of imports of goods and services (% of GDP) for
# Europe and Asia, restricted to years after 1990.
data %>%
  filter(
    Year > 1990,
    continent %in% c("Europe", "Asia")
  ) %>%
  ggplot(
    aes(x = Imports.of.goods.and.services....of.GDP., fill = continent)
  ) +
  geom_density(alpha = 0.3) +
  labs(title = "Imports of goods and services between Europe and Asia")

# Two-sample t-test comparing imports of goods and services (% of GDP) between
# Europe and Asia for years after 1990.
my_Data <- data %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
  select(continent, Imports.of.goods.and.services....of.GDP.)
t.test(
  Imports.of.goods.and.services....of.GDP. ~ continent,
  data = my_Data
)
## 
##  Welch Two Sample t-test
## 
## data:  Imports.of.goods.and.services....of.GDP. by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
##  -2.321099 12.433240
## sample estimates:
##   mean in group Asia mean in group Europe 
##             46.84531             41.78924

Based on the results, the p-value of 0.1776 is greater than the typical significance level of 0.05. This means we cannot reject the null hypothesis: the data do not show a statistically significant difference in imports of goods and services between Asia and Europe.


6 What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years?

# Average the population density of each country over all recorded years and
# display the country with the highest average as a striped table.
# na.rm = TRUE is needed because mean() returns NA as soon as one year is
# missing, and which.max() discards NA values, so a country with any gap
# would otherwise be dropped from the comparison without warning.
data %>%
  group_by(Country.Name) %>%
  summarise(
    mean = mean(
      Population.density..people.per.sq..km.of.land.area.,
      na.rm = TRUE
    )
  ) %>%
  slice(which.max(mean)) %>%
  kbl() %>%
  kable_material("striped")
Country.Name mean
Macao SAR, China 14732.04

Macao SAR, China has the highest mean ‘Population density (people per sq. km of land area)’ across all years.


7 What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ between 1962 and 2007?

# Change in life expectancy at birth between 1962 and 2007 for each country.
# The two years are spread into columns named `1962` and `2007`, their
# difference is stored in diff_LE, and rows are sorted by largest increase.
# No group_by() is used: pivot_wider() already identifies rows by the remaining
# Country.Name column, and grouping here would leave glexpData as a grouped
# tibble for every later step.
glexpData <- data %>%
  filter(Year %in% c(1962, 2007)) %>%
  select(Year, Country.Name, Life.expectancy.at.birth..total..years.) %>%
  pivot_wider(
    names_from = Year,
    values_from = Life.expectancy.at.birth..total..years.
  ) %>%
  mutate(diff_LE = `2007` - `1962`) %>%
  arrange(desc(diff_LE))

# Show the five countries with the largest increase.
kbl(head(glexpData, 5)) %>%
  kable_material(c("striped", "hover"))
Country.Name 1962 2007 diff_LE
Maldives 38.48356 75.39971 36.91615
Bhutan 33.09415 66.29310 33.19895
Timor-Leste 34.73905 65.82420 31.08515
Tunisia 43.34168 74.20244 30.86076
Oman 44.30051 75.12361 30.82310

Maldives saw the greatest increase in life expectancy at birth between 1962 and 2007.